{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Reducing Overfitting with Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Use within an Jupyter notebook\n",
    "%matplotlib inline   \n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.datasets import load_diabetes\n",
    "\n",
    "diabetes = load_diabetes()\n",
    "\n",
    "X = diabetes.data\n",
    "y = diabetes.target\n",
    "\n",
    "X_feature_names = ['age', 'gender', 'body mass index', 'average blood pressure','bl_0','bl_1','bl_2','bl_3','bl_4','bl_5']\n",
    "\n",
    "\n",
    "bins = 50*np.arange(8)\n",
    "binned_y = np.digitize(y, bins)\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=binned_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=10, error_score='raise',\n",
       "       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n",
       "           max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "           min_impurity_split=None, min_samples_leaf=1,\n",
       "           min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "           presort=False, random_state=None, splitter='best'),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid={'max_depth': [3, 5, 7, 9, 20]}, pre_dispatch='2*n_jobs',\n",
       "       refit=True, return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "dtr = DecisionTreeRegressor()\n",
    "\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "gs_inst = GridSearchCV(dtr, param_grid = {'max_depth': [3,5,7,9,20]},cv=10)\n",
    "gs_inst.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,\n",
       "           max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "           min_impurity_split=None, min_samples_leaf=1,\n",
       "           min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "           presort=False, random_state=None, splitter='best')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gs_inst.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50.811508414171186"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred = gs_inst.predict(X_test)\n",
    "\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "mean_absolute_error(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.44293644466332444"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "(np.abs(y_test - y_pred)/(y_test)).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn import tree\n",
    "from sklearn.externals.six import StringIO\n",
    "\n",
    "import pydot\n",
    "from IPython.display import Image\n",
    "\n",
    "dot_diabetes = StringIO()\n",
    "tree.export_graphviz(gs_inst.best_estimator_, out_file = dot_diabetes, feature_names = X_feature_names)\n",
    "graph = pydot.graph_from_dot_data(dot_diabetes.getvalue())"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
